Projekt 3: Financial Data

Descriptive Statistics and Simple Models

Present the data, estimate the parameters in a normal model, and asses if the normal model is appropriate.

D <- read.table("finance_data.csv", header=TRUE, sep=";", 
                as.is=TRUE)
## Dimensions of D (number of rows and columns)
dim(D)
## [1] 454   2
##  Column/variable names
names(D)
## [1] "time" "SLV"
## The first rows/observations
head(D)
## The last rows/observations
tail(D)
## Selected summary statistics
summary(D)
##      time                SLV           
##  Length:454         Min.   :-0.238893  
##  Class :character   1st Qu.:-0.026350  
##  Mode  :character   Median : 0.002226  
##                     Mean   : 0.001468  
##                     3rd Qu.: 0.033122  
##                     Max.   : 0.267308
## Another type of summary of the dataset
str(D)
## 'data.frame':    454 obs. of  2 variables:
##  $ time: chr  "2006-5-5" "2006-5-12" "2006-5-19" "2006-5-26" ...
##  $ SLV : num  0.01376 0.03286 -0.12863 0.00556 -0.04578 ...
D$date <- as.Date(D$time)
D$year <- year(D$date)

ggplot(D, aes(x = date, y = SLV)) + geom_point()

# ggplot(D, aes(x = SLV)) +
#   geom_histogram(aes(y = ..density..), color = 'black')

par <- nlminb(start = c(1,1), objective = testDistribution,
              distribution = "normal",
              x = D$SLV)
ggplot(D)+
  geom_histogram(aes(x = SLV, y= ..density..,), color='black') +#color, fill
  stat_function(fun = dnorm, n = dim(D)[1], args = list(mean = par$par[1], sd = par$par[2]), color='red')

# plot(ecdf(D$SLV), verticals = T)
# xseq <- seq(0.9*min(D$SLV), 1.1*max(D$SLV), length.out=100)
# lines(xseq, pnorm(xseq, mean(D$SLV), sd(D$SLV)), col='red')
#plot(xseq, pnorm(xseq, mean(D$SLV), sd(D$SLV)), col='red')
qqnorm(D$SLV)
qqline(D$SLV)

Hypothesize a model that could fit the data better (Hint: consider tail probabilities), and compare with the normal model estimated above

lcauchyFUNC <- function(p, data){
  x0 <- p[1] #location R
  gam <- p[2] #scale R > 0
  return(-sum(dcauchy(x = data, location = x0, scale = gam, log = T)))
}
llstFUNC <- function(p, data){ #location-scale t-distribution
  return(-sum(dlst(x = data, df = p[1], mu = p[2], sigma = p[3], log = T)))
}
lgnFUNC <- function(p, data){ #symmetric generalized normal dist
  return(-sum(dgnorm(x = data, mu = p[1], alpha = p[2], beta = p[3], log = T)))
}
lsnFUNC <- function(p, data){ #skewed normal dist
  return(-sum(dsn(x = data, xi = p[1], omega = p[2], alpha = p[3], log = T)))
}

par.cauchy <- nlminb(start = c(0,1), objective = lcauchyFUNC, data = D$SLV)
par.lst <- nlminb(start = c(1,1,1), objective = llstFUNC, data = D$SLV)
par.gn <- nlminb(start = c(1,1,1), objective = lgnFUNC, data = D$SLV)
## Not defined for negative values of alpha and/or beta.
## Not defined for negative values of alpha and/or beta.
## Not defined for negative values of alpha and/or beta.
## Not defined for negative values of alpha and/or beta.
par.sn <- nlminb(start = c(1,1,1), objective = lsnFUNC, data = D$SLV)

ggplot(D)+
  geom_histogram(aes(x = SLV, y= ..density..,), color='black') + #color, fill
  stat_function(fun = dnorm, n = dim(D)[1], args = list(mean = par$par[1], sd = par$par[2]), aes(colour = "norm")) +
  stat_function(fun = dcauchy, n = dim(D)[1], args = list(location = par.cauchy$par[1],
                                                          scale = par.cauchy$par[2]), aes(colour = "cauchy")) +
  stat_function(fun = dlst, n = dim(D)[1], args = list(df = par.lst$par[1], mu = par.lst$par[2],
                                                       sigma = par.lst$par[3]), aes(colour = "lst")) +
  stat_function(fun = dgnorm, n = dim(D)[1], args = list(mu = par.gn$par[1], alpha = par.gn$par[2],
                                                         beta = par.gn$par[3]), aes(colour = "gnorm")) +
  stat_function(fun = dsn, n = dim(D)[1], args = list(xi = par.sn$par[1], omega = par.sn$par[2],
                                                      alpha = par.sn$par[3]), aes(colour = "sn")) +
  scale_colour_manual(values = c("blue", "red", "yellow", "black", "grey"))+
  labs(colour = "Distribution")

AIC.norm <- -2 * sum(dnorm(x = D$SLV, mean = par$par[1], sd = par$par[2], log = T))
+ 2 * length(par$par)
## [1] 4
AIC.cauchy <- -2 * sum(dcauchy(x = D$SLV, location = par.cauchy$par[1],
                               scale = par.cauchy$par[2], log = T)) + 2 * length(par.cauchy$par)
AIC.lst <- -2 * sum(dlst(x=D$SLV, df = par.lst$par[1], mu = par.lst$par[2], sigma = par.lst$par[3], log = T)) + 2 * length(par.lst$par)
AIC.gn <- -2 * sum(dgnorm(x=D$SLV, mu = par.gn$par[1], alpha = par.gn$par[2], beta = par.gn$par[3], 
                          log = T)) + 2 * length(par.gn$par)
AIC.sn <- -2 * sum(dsn(x=D$SLV, xi = par.sn$par[1], omega = par.sn$par[2], alpha = par.sn$par[3], 
                       log = T)) + 2 * length(par.sn$par)

round(rbind(AIC.norm, AIC.cauchy, AIC.lst, AIC.gn, AIC.sn), digits=5)
##                 [,1]
## AIC.norm   -1464.000
## AIC.cauchy -1363.414
## AIC.lst    -1490.234
## AIC.gn     -1480.391
## AIC.sn     -1458.000
n <- 100000 
par(mfrow=c(2,2)) #comparing by means of generating histograms with n = 100000 points for each of the distributions
hist(D$SLV)
hist(rnorm(n, mean = par$par[1], sd = par$par[2]))
hist(rlst(n, df = par.lst$par[1], mu = par.lst$par[2], sigma = par.lst$par[3]))
hist(rgnorm(n, mu = par.gn$par[1], alpha = par.gn$par[2], beta = par.gn$par[3]))

#hist(rsn(n, xi = par.sn$par[1], omega = par.sn$par[2], alpha = par.sn$par[3]))

Present the final model (i.e. relevant keynumbers for the estimates)

First we have the profile likelihoods of the two parameters of the normal distribution optimized at the start of project 3. This is to enable comparison with the profile likelihoods of the chosen distribution which is the generalized normal distribution. Likelihood based and Wald CIs are also calculated but are not printed till later.

alpha <- 0.05
c <- exp(-0.5 * qchisq(1-alpha, df = 1))
par(mfrow=c(1,2))
#####Likelihood based CI for location-scale t-distribution
mle.lst <- par.lst$par

#lstFUNC w/ NLL = T, negative log-likelihood using p's

#lstFUNC w/ NLL = F, log-likelihood using p's

lstFUNC <- function(df, mu, sigma, data, log = F){
  if(!log){
    return(prod(dlst(x = data, df = df, mu = mu, sigma = sigma, log = F) / 2)) # to avoid inf values
  } else {
    return(sum(dlst(x = data, df = df, mu = mu, sigma = sigma, log = T)))
  }
}  


CIfun.lst <- function(y, data, p = "df"){##### T from mean, F for sigma
  if(p == "df"){
    sum(dlst(x = data, df = mle.lst[1], mu = mle.lst[2], sigma = mle.lst[3], log = T)) -
      sum(dlst(x = data, df = y, mu = mle.lst[2], sigma = mle.lst[3], log = T)) - 
      0.5 * qchisq(1-alpha, df = 1)
  } else if(p == "mu") {
    sum(dlst(x = data, df = mle.lst[1], mu = mle.lst[2], sigma = mle.lst[3], log = T)) -
      sum(dlst(x = data, df = mle.lst[1], mu = y, sigma = mle.lst[3], log = T)) - 
      0.5 * qchisq(1-alpha, df = 1)
  } else { #p == "sigma"
    sum(dlst(x = data, df = mle.lst[1], mu = mle.lst[2], sigma = mle.lst[3], log = T)) -
      sum(dlst(x = data, df = mle.lst[1], mu = mle.lst[2], sigma = y, log = T)) - 
      0.5 * qchisq(1-alpha, df = 1)
  }
}
###PROFILE likelihoods
par(mfrow = c(1,3))
#df
dfs.lst <- seq(mle.lst[1]-2.5, mle.lst[1]+4.25, by = 0.0001)
df.lst <- sapply(X = dfs.lst, FUN = lstFUNC, mu = mle.lst[2], sigma = mle.lst[3], data = D$SLV, log = F)
plot(dfs.lst, df.lst/max(df.lst), col = 1, type = "l", xlab = "df",
     main = "Parameter value for df for location-scale t-distribution model of SLV")
CI.df.lst <- c(uniroot(f=CIfun.lst, interval = c(min(dfs.lst), mle.lst[1]), data = D$SLV, p = "df")$root,
               uniroot(f=CIfun.lst, interval = c(mle.lst[1], max(dfs.lst)), data = D$SLV, p = "df")$root)
lines(range(dfs.lst), c*c(1,1), col = 2)
#mu
mus.lst <- seq(mle.lst[2]-0.01, mle.lst[2]+0.01, by = 0.00001)
mu.lst <- sapply(X = mus.lst, FUN = lstFUNC, df = mle.lst[1], sigma = mle.lst[3], data = D$SLV, log = F)
plot(mus.lst, mu.lst/max(mu.lst), col = 1, type = "l", xlab = expression(paste(mu)),
     main = "Parameter value for location of location-scale t-distribution model of SLV")
CI.mu.lst <- c(uniroot(f = CIfun.lst, interval = c(min(mus.lst), mle.lst[2]), data = D$SLV, p = "mu")$root,
              uniroot(f = CIfun.lst, interval = c(mle.lst[2], max(mus.lst)), data = D$SLV, p = "mu")$root)
lines(range(mus.lst), c*c(1,1), col = 2)
#sigma
sigmas.lst <- seq(mle.lst[3]-0.01,mle.lst[3]+0.01, by = 0.0001)
sigma.lst <- sapply(X = sigmas.lst, FUN = lstFUNC, df = mle.lst[1], mu = mle.lst[2], data = D$SLV, log = F)
plot(sigmas.lst, sigma.lst/max(sigma.lst), col = 1, type = "l", xlab = expression(paste(sigma)),
     main = "Parameter value for scale for generalized normal model of SLV")
CI.sigma.lst <- c(uniroot(f = CIfun.lst, interval = c(min(sigmas.lst), mle.lst[3]), data = D$SLV, p = "sigma")$root,
                 uniroot(f = CIfun.lst, interval = c(mle.lst[3], max(sigmas.lst)), data = D$SLV, p = "sigma")$root)
lines(range(sigmas.lst), c*c(1,1), col = 2)

#Wald CIs
n <- dim(D)[1]
H.df.lst <- hessian(lstFUNC, mle.lst[1], mu = mle.lst[2], sigma = mle.lst[3], data = D$SLV, log = T)
V.df.lst <- as.numeric(-1/H.df.lst)
H.mu.lst <- hessian(lstFUNC, mle.lst[2], df = mle.lst[1], sigma = mle.lst[3], data = D$SLV, log = T)
V.mu.lst <- as.numeric(-1/H.mu.lst)
H.sigma.lst <- hessian(lstFUNC, mle.lst[3], df = mle.lst[1], mu = mle.lst[2], data = D$SLV, log = T)
V.sigma.lst <- as.numeric(-1/H.sigma.lst)
wald.df.lst <- mle.lst[1] + c(-1,1) * qnorm(1-alpha/2) * sqrt(V.df.lst)
wald.mu.lst <- mle.lst[2] + c(-1,1) * qnorm(1-alpha/2) * sqrt(V.mu.lst)
wald.sigma.lst <- mle.lst[3] + c(-1,1) * qnorm(1-alpha/2) * sqrt(V.sigma.lst)

round( rbind(CI.df.lst, wald.df.lst, CI.mu.lst, wald.mu.lst, CI.sigma.lst, wald.sigma.lst), digits=5);round( rbind(mle.lst), digits=5)
##                    [,1]    [,2]
## CI.df.lst       4.39309 9.59388
## wald.df.lst     3.85975 8.71193
## CI.mu.lst      -0.00157 0.00663
## wald.mu.lst    -0.00158 0.00664
## CI.sigma.lst    0.03656 0.04278
## wald.sigma.lst  0.03639 0.04261
##            [,1]    [,2]   [,3]
## mle.lst 6.28584 0.00253 0.0395
temp1 <- paste("df == ", round(mle.lst[1], digits=2))
temp2 <- paste("mu == ", round(mle.lst[2], digits=5))
temp3 <- paste("sigma == ", round(mle.lst[3], digits=4))

ggplot(D)+
  geom_histogram(aes(x = SLV, y= ..density..,), color='black') + #color, fill
  stat_function(fun = dlst, n = dim(D)[1], args = list(df = par.lst$par[1], mu = par.lst$par[2],
                                                         sigma = par.lst$par[3]), color = 'red') +
  annotate( "text", x = 2.7/5*max(D$SLV), y = c(10.5, 10.0, 9.5), label = c(temp1,temp2,temp3), parse = T  ) +
  ggtitle("Location-scale t-distribution and distribution of the weekly returns")

qqlst <- function (y, line = FALSE, ...)
{
  y <- y[!is.na(y)]
  n <- length(y)
  x <- qlst(c(1:n)/(n + 1), df=mle.lst[1])
  m <- mean(y)
  ylim <- c(min(y), max(y))
  qqplot(x, y, xlab = "location-scale t-distribution plotting position", ylim = ylim, 
         ylab = "Ordered sample", ...)
  if (line) 
    abline(0, m, lty = 2)
  invisible()
}
par(mfrow=c(1,2))
qqnorm(D$SLV)
qqline(D$SLV)
qqlst(D$SLV)
qqline(D$SLV)